Adapted from the course material provided by the Institute for Systems Biology, Seattle, Washington, USA.

Introduction

This project uses gene expression data from the Cancer Genome Atlas (TGCA) Pan-Cancer Atlas Consortium.

1.1 Load packages and data for analysis

# Load packages for analysis 
packages <- c(
  # set of packages for easy manipulation of data (Author: Hadley Wickam http://hadley.nz/)
  "tidyverse", 
  "magrittr",
  "ConsensusClusterPlus",
  "sigclust", 
  "cluster",
  "RColorBrewer", # library for east access to multiple colors for plotting
  "colorRamps",
  "pheatmap",
  "gplots",
  "survival",
  "GEOquery",
  "Rtsne",
  "knitr"
)

# Check that all required packages are installed i.e. require("ConsensusClusterPlus") and loaded
# sapply -- apply a function to all the elements of a vector like object
sapply(X = packages, FUN = require, character.only = TRUE, quietly=TRUE)
##            tidyverse             magrittr ConsensusClusterPlus 
##                 TRUE                 TRUE                 TRUE 
##             sigclust              cluster         RColorBrewer 
##                 TRUE                 TRUE                 TRUE 
##           colorRamps             pheatmap               gplots 
##                 TRUE                 TRUE                 TRUE 
##             survival             GEOquery                Rtsne 
##                 TRUE                 TRUE                 TRUE 
##                knitr 
##                 TRUE
# data directory
dataDirectory <- "data"

# results directory
resultsDirectory <- "results"

dataFilePath <- file.path(dataDirectory, "pancan_dataset", "BRCA", "gexp_BRCA_pancan.csv")
print(dataFilePath)
## [1] "data/pancan_dataset/BRCA/gexp_BRCA_pancan.csv"
# load file into environment
gexp = read.csv(dataFilePath, header=T, row.names=1)  # baseR

1.2 Exploratory Data Analysis

Perform some standard exploratory data analysis and perform quick quality control checks by viewing the data.

# View snippet of the expression matrix
gexp[1:10,1:5]
##        TCGA.3C.AAAU TCGA.3C.AALI TCGA.3C.AALJ TCGA.3C.AALK TCGA.4H.AAAK
## COL1A1     55574.20   179261.000    180821.00     434557.0     413984.0
## FN1        46116.70   106743.000    120646.00     136341.0     244774.0
## COL1A2     34666.80    98721.600     94717.10     236808.0     298457.0
## COL3A1     31598.70    71842.800     69689.00     196171.0     239839.0
## ACTB       41867.10   107063.000    110079.00      98775.8      94600.9
## EEF1A1     30241.40    32369.800     57191.30     101169.0      89656.2
## SPARC      21273.00    41839.600     50817.80     115850.0     127788.0
## MGP         1409.65      742.251      7568.45      44747.6      40891.5
## XBP1       39531.00    33026.100     18007.30      66764.6      52518.3
## ACTG1      53008.50    91177.800    101164.00     177436.0      79073.2

Check that there are no NA values.

print(sum(colSums(is.na(gexp))))
## [1] 0

How many samples are in the dataset?

print(paste0("Number of samples: ", dim(gexp)[[2]]))
## [1] "Number of samples: 1082"
print(paste0("Number of genes: ", dim(gexp)[[1]]))
## [1] "Number of genes: 10000"

Visualize the distribution of expression values for each sample by plotting boxplots and density plots to check for outliers and see the overall distribution of the data.

# Boxplots of expression values to check data normlization
# Create a random set of n samples
n <- 50
gexp_rand_sample <- gexp[,sample(1:dim(gexp)[2], n)]  

# convert data to long format
# %>% Pipe operator - used to chain commands together
gexp_long <- gexp_rand_sample %>% 
  mutate(GeneName=rownames(gexp_rand_sample)) %>% 
  gather("TGCA_Participant_Barcode", "Expression_Level", 1:dim(gexp_rand_sample)[[2]])
gexp_long[1:10,]
##    GeneName TGCA_Participant_Barcode Expression_Level
## 1    COL1A1             TCGA.OL.A5RW        68900.600
## 2       FN1             TCGA.OL.A5RW        34764.400
## 3    COL1A2             TCGA.OL.A5RW        42684.100
## 4    COL3A1             TCGA.OL.A5RW        31610.200
## 5      ACTB             TCGA.OL.A5RW       114515.000
## 6    EEF1A1             TCGA.OL.A5RW        67054.200
## 7     SPARC             TCGA.OL.A5RW        26879.000
## 8       MGP             TCGA.OL.A5RW          557.509
## 9      XBP1             TCGA.OL.A5RW         2060.770
## 10    ACTG1             TCGA.OL.A5RW        48464.100
# Visualize using a boxplot with ggplot
gexp_long %>% 
  ggplot(aes(x=TGCA_Participant_Barcode, y=Expression_Level)) + 
  geom_boxplot() + # create boxplot using ggplot
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) # rotate x-axis label

gexp_long %>% 
  ggplot(aes(x=Expression_Level, color=TGCA_Participant_Barcode)) + 
  geom_density() +
  labs(title=paste0("mRNA Expression Distribution in Cohort\n Random Sample (n=", n,  ")"))  +
  theme(legend.position="none") + 
  xlim(0,3000)
## Warning: Removed 61686 rows containing non-finite values (stat_density).

Clearly the data is heavily skewed. Perform a log transformation to scale the data, and then median center each gene’s expression for use in consensus clustering by subtracting the median expression of each gene (row median) from expression in each sample. This mitigates magnitude differences between genes and sets the median to 0 for all genes.

# Perform log transform
gexp_log <- log2(gexp+1)

# Median center data
gexp_log_centered = sweep(gexp_log,1,apply(gexp_log,1,median,na.rm=T)) 

# Plot new density plot with transformed data
gexp_log_sample <- log2(gexp_rand_sample + 1)
gexp_log_centered_sample = sweep(gexp_log_sample,1,apply(gexp_log_sample,1,median,na.rm=T)) 

gexp_log_centered_sample %>% 
  mutate(GeneName=rownames(gexp_log_centered_sample)) %>% 
  gather("TGCA_Participant_Barcode", "Expression_Level", 1:dim(gexp_log_centered_sample)[[2]])%>% 
  ggplot(aes(x=Expression_Level, color=TGCA_Participant_Barcode)) + 
  geom_density() +
  labs(title=paste0("mRNA Expression Distribution in Cohort\n Random Sample (n=", n,  ")"))  +
  theme(legend.position="none")

1.3 Create training and testing cohort

Partition the cohort into a training and testing dataset.

set.seed(10)
hold_out = sample(colnames(gexp_log_centered), size = 350)
gexp_testing = gexp_log_centered[,hold_out] # testing cohort
dim(gexp_testing)
## [1] 10000   350
gexp_training = gexp_log_centered[,which(!(colnames(gexp_log_centered)%in%hold_out))] # training cohort
dim(gexp_training)
## [1] 10000   732

1.4 Feature Selection

Here, we will use median absolute deviation (MAD) for feature selection.

# Calculate the MAD for each gene in the training dataset
mads = apply(gexp_training,1,mad)
head(mads[order(mads, decreasing = T)])
##  SCGB2A2   HMGCS2   CYP4Z1  SCGB1D2 CYP2B7P1      PIP 
## 5.824316 5.215207 5.095460 5.094898 4.949365 4.592107
tail(mads[order(mads, decreasing = T)])
##      RAF1    HNRNPK   KHDRBS1     WDR33     CIAO1    TARDBP 
## 0.2581625 0.2450576 0.2247351 0.2210807 0.2199930 0.2018590

Select the most variable genes.

# select top k genes
top_n <- 2000

# subset matrix -- get the most variable genes (more informative for clustering)
gexp_training_most_var = gexp_training[order(mads,decreasing=T)[1:top_n],]
gexp_testing_most_var = gexp_testing[order(mads,decreasing=T)[1:top_n],]

1.5 Consensus Clustering to Discover Disease Subtypes

Apply the function ConsensusClusterPlus to carry out consensus clustering for k 2……9.

For this analysis there are many parameter choices but the main ones are the following: - clustering algorithm - clusterAlg + ‘hc’ heirarchical (hclust) + ‘pam’ for paritioning around medoids + ‘km’ for k-means on data matrix + ‘kmdist’ for k-means on distance matrices or a function that returns a clustering. - distance metric - distance + ‘pearson’: (1 - Pearson correlation) + ‘spearman’ (1 - Spearman correlation) + ‘euclidean’ + ‘binary’ + ‘maximum’ + ‘canberra’ + ‘minkowski’ + custom distance function - linkage - the agglomeration method to be used + ward.D + ward.D2 + single + complete + average=UPGMA + mcquitty=WPGMA + median=WPGMC + centroid=UPGMC

#pdf(file.path(resultsDirectory, 'consensusClustering_training_brca.pdf')) # results stored here
#title='mRNA Expression Clustering of TCGA Breast Cancer Dataset'
results = ConsensusClusterPlus(as.matrix(gexp_training_most_var), maxK=9, reps=100, pItem=0.8, pFeature=1, clusterAlg="hc", distance="pearson", innerLinkage ="average" , finalLinkage = "average", seed=10)
## end fraction
## clustered

## clustered

## clustered

## clustered

## clustered

## clustered

## clustered

## clustered

#dev.off()

1.6 Significance of Subtype Separation

Next we perform analyses to determine if the discovered clusters are significantly separable using sigclust. Sigclust analyzes whether clusters are really there, using the 2-means (k = 2) clustering index as a statistic. It assesses the significance of clustering by simulation from a single null Gaussian distribution.

# Calculate pairwise significance of clusters for k clusters using sigclust 
pairwise_cluster_significance <- function(k, expr_mat, cluster_labels){
  #oldw <- getOption("warn") # turn off warnings for MLE estimation
  #options(warn = -1)
  result_mat <- matrix(nrow = k, ncol = k, dimnames=list(1:k,1:k))
  corr_mat = cor(as.matrix(expr_mat),method = "pearson", use="pairwise.complete.obs") 
  
  for(i in 1:k) {
    for(j in i:k) {
      if(!i==j) {
        tmp1 = names(which(cluster_labels==i)) # samples in cluster i
        tmp2 = names(which(cluster_labels==j)) # samples in cluster j
        lab1 = c(rep(1,length(tmp1)),rep(2,length(tmp2))) # label vector
        
        # alternatively use correlation matrix in the sigclust function
        tmp3 = sigclust(corr_mat[c(tmp1,tmp2),c(tmp1,tmp2)],nsim=100,labflag=1,label=lab1, 
                        icovest = 3)
        
        result_mat[i,j] = tmp3@pvalnorm
      }
    }
  }
  return(result_mat)
} 

# correlation matrix for samples
corr_mat = cor(as.matrix(gexp_training_most_var), method = "pearson") # distance metric used for clustering
dissimilarity <- 1 - corr_mat

# Calculate pairwise significance of clusters for 3 clusters
k <- 3

# extract consensus clustering results - cluster labels
clusts = results[[k]]$consensusClass[colnames(gexp_training_most_var)]
head(clusts) # vector with sample cluster membership for k solution
## TCGA.3C.AAAU TCGA.3C.AALI TCGA.3C.AALJ TCGA.4H.AAAK TCGA.5L.AAT0 
##            1            2            1            3            3 
## TCGA.5T.A9QA 
##            1
# Assess signficance of cluster separability
sigclust_mat_k3 <- pairwise_cluster_significance(k=3, expr_mat=gexp_training_most_var, cluster_labels = clusts)
sigclust_mat_k3
##    1             2             3
## 1 NA 3.477327e-191  7.391862e-51
## 2 NA            NA 2.948449e-251
## 3 NA            NA            NA
image(x=1:3, y=1:3, sigclust_mat_k3, zlim=c(0,1), col=colorpanel(256,'blue','black','yellow'), 
      main='3 clusters', xlab='k', ylab='k')

# Compute silhouette information for 3 clusters
silhouette_k3 <- silhouette(x=clusts, dist = dissimilarity) # distance metric used in clustering)
plot(silhouette_k3, col = RColorBrewer::brewer.pal(3, "Set1"), border=NA, main="Silhouette plot of K=3", cex.names = 0.8)

# Repeat assessments for k 4,5
# Calculate significance of cluster separation for 4 clusters
k <- 4
clusts = results[[k]]$consensusClass[colnames(gexp_training_most_var)]
sigclust_mat_k4 <- pairwise_cluster_significance(k=4, expr_mat=gexp_training_most_var, cluster_labels = clusts)

# Compute silhouette information for 4 clusters
silhouette_k4 <- silhouette(x=clusts, dist = dissimilarity)

# Calculate significance of cluster separation for 5 clusters
k <- 5
clusts = results[[k]]$consensusClass[colnames(gexp_training_most_var)]
sigclust_mat_k5 <- pairwise_cluster_significance(k=5, expr_mat=gexp_training_most_var, cluster_labels = clusts)

silhouette_k5 <- silhouette(x=clusts, dist = dissimilarity) 

# Summary plot sigclust results
#pdf(file.path(resultsDirectory, 'training_sigclust.pdf'))
par(mfrow=c(2,2))
image(x=1:3,y=1:3, sigclust_mat_k3, zlim=c(0,1),col=colorpanel(256,'blue','black','yellow'),main='3 clusters',xlab='k',ylab='k')
image(x=1:4,y=1:4, sigclust_mat_k4, zlim=c(0,1),col=colorpanel(256,'blue','black','yellow'),main='4 clusters',xlab='k',ylab='k')
image(x=1:5,y=1:5,  sigclust_mat_k5, zlim=c(0,1),col=colorpanel(256,'blue','black','yellow'),main='5 clusters',xlab='k',ylab='k')
image(matrix(data=seq(from=0,to=1,length.out=10),nrow=10,ncol=1),col=colorpanel(256,'blue','black','yellow'),main='Legend',axes=F,xlab='P-Value')

#axis(1)
#dev.off()

#pdf(file.path(resultsDirectory, 'training_silhouette.pdf'), width = 8, height = 11)
par(mfrow=c(3,1))
plot(silhouette_k3,  col =  RColorBrewer::brewer.pal(3, "Set1"), border=NA, 
     main="Silhouette plot of K=3", cex.names = 0.8)
plot(silhouette_k4,  col =  RColorBrewer::brewer.pal(4, "Set1"), border=NA, 
     main="Silhouette plot of K=4", cex.names = 0.8)
plot(silhouette_k5,  col =  RColorBrewer::brewer.pal(5, "Set1"), border=NA,
     main="Silhouette plot of K=5", cex.names = 0.8)

#dev.off()

1.7 Optimal clustering solution

What is the optimal cluster solution? Choose k clusters according to diagnostic plots

num_clusters <- 3
clusts = results[[num_clusters]]$consensusClass[colnames(gexp_training_most_var)]

# Plot expression heatmap for training samples and clustering solution
most_var_genes <- names(mads[order(mads,decreasing=T)[1:300]])
meta_df <- as.data.frame(clusts)
meta_df$clusts <- as.factor(meta_df$clusts)
names(meta_df) <- c("Consensus Cluster")
pheatmap::pheatmap(gexp_training_most_var[most_var_genes, names(clusts[order(clusts)])], show_rownames=FALSE, show_colnames=FALSE, 
                   annotation_col = meta_df,
                   cluster_rows=TRUE, cluster_cols=FALSE, scale="row", 
                   color = colorRamps::matlab.like(25), annotation_names_col = F,
                   main="Top 300 Most Variable Genes\n Training Cohort", 
                   filename = file.path(resultsDirectory, "heatmap_training_cohort_most_variable_expression_brca.pdf")
)

1.8 Test on hold-out cohort

# Create centroids for clustering new samples
centroids = matrix(nrow=nrow(gexp_training_most_var),ncol=num_clusters,dimnames=list(rownames(gexp_training_most_var),c(1:num_clusters)))

for(clust in 1:num_clusters) {
  tmp1 = names(which(clusts==clust))
  centroids[,clust] = apply(gexp_training_most_var[,tmp1],1,median)
}

head(centroids)
##                   1         2        3
## SCGB2A2  -0.4557219 -3.925706 2.750578
## HMGCS2   -0.8658018 -3.768695 2.448184
## CYP4Z1   -0.7187862 -3.031524 2.808021
## SCGB1D2  -0.2963639 -3.685027 2.155398
## CYP2B7P1  1.5957222 -6.126503 1.247740
## PIP      -0.6098934 -3.255275 1.955698

To assign testing set samples to a learned cluster, we find the correlation between each sample and the cluster centroids

corr_mat_testing = cor(cbind(centroids, gexp_training_most_var),method='pearson', use= "pairwise.complete.obs")[-c(1:num_clusters),1:num_clusters]

# assign each sample (row) to the nearest centroid (i.e. the one with highest correlation)
clusts_testing= sapply(1:nrow(corr_mat_testing), function(x) { which(corr_mat_testing[x,]==max(corr_mat_testing[x,])) })
names(clusts_testing) = rownames(corr_mat_testing)
head(clusts_testing)
## TCGA.3C.AAAU TCGA.3C.AALI TCGA.3C.AALJ TCGA.4H.AAAK TCGA.5L.AAT0 
##            1            2            1            3            3 
## TCGA.5T.A9QA 
##            1

How well are the testing set samples represented in the clustering solution?

# Calculate pairwise significance of clusters for k clusters
sigclust_testing <- pairwise_cluster_significance(k = num_clusters, expr_mat = gexp_training_most_var, cluster_labels = clusts_testing)

# Compute silhouette information for clustering in 5 clusters
# correlation matrix for samples
corr_mat = cor(as.matrix(gexp_training_most_var), method = "pearson") # distance metric used for clustering
dissimilarity <- 1 - corr_mat
silhouette_testing <- silhouette(x=clusts_testing, dist = dissimilarity) # distance metric used in clustering)

# Plot significance of separability on hold-out test cohort
#pdf(file.path(resultsDirectory, 'testing_sigclust_silhouette_lung.pdf'))
par(mfrow=c(2,2))
image(x=1:num_clusters,y=1:num_clusters, sigclust_testing, zlim=c(0,1),col=colorpanel(256,'blue','black','yellow'),main=paste0(num_clusters, ' clusters'),xlab='k',ylab='k')
image(matrix(data=seq(from=0,to=1,length.out=100),nrow=10,ncol=1),col=colorpanel(256,'blue','black','yellow'),main='Legend',axes=F,xlab='P-Value')
axis(1)
plot(silhouette_testing,  col =  RColorBrewer::brewer.pal(num_clusters, "Set2"), main="Silhouette Plot of Testing Cohort", border=NA)
#dev.off()

# Write out the subtype definitions for TCGA discovery and testing cohorts
write.csv(cbind(names(clusts),clusts),row.names=F, file.path(resultsDirectory, 'tcga_brca_training_clusters.csv'))
write.csv(cbind(names(clusts_testing),clusts_testing),row.names=F, file.path(resultsDirectory, 'tcga_brca_testing_clusters.csv'))